from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_community.document_loaders import UnstructuredEPubLoader

# Some extra dependencies need to be installed for this loader
# loader = UnstructuredPDFLoader("data/数字孪生城市白皮书2023年.pdf")

# Split the Word document according to the elements it contains
# loader=UnstructuredWordDocumentLoader("data/category-level.docx",mode="elements")

# Load the EPUB while preserving the file's structure: mode="elements" is
# passed so the document is not collapsed into a single blob of text.
loader = UnstructuredEPubLoader(
    "data/winter-sports.epub",
    mode="elements",
)

# Read the file and dump the loaded result to stdout for inspection.
data = loader.load()
print(data)

